This notebook examines the relationship between different features of the data for distinguishing viral from nonviral sequences.
Please reach out to James Riddell (riddell.26@buckeyemail.osu.edu) or Bridget Hegarty (beh53@case.edu) regarding any issues, or open an issue on github.
library(ggplot2)
library(plyr)
library(reshape2)
library(viridis)
Loading required package: viridisLite
library(tidyr)
Attaching package: ‘tidyr’
The following object is masked from ‘package:reshape2’:
smiths
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:plyr’:
arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(readr)
library(data.table)
data.table 1.14.0 using 1 threads (see ?getDTthreads). Latest news: r-datatable.com
**********
This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
**********
Attaching package: ‘data.table’
The following objects are masked from ‘package:dplyr’:
between, first, last
The following objects are masked from ‘package:reshape2’:
dcast, melt
viruses <- read_tsv("../IntermediaryFiles/viral_tools_combined.tsv")
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
.default = col_double(),
seqtype = col_character(),
contig = col_character(),
checkv_provirus = col_character(),
checkv_quality = col_character(),
method.x = col_character(),
Classified = col_character(),
IDs_all = col_character(),
Seq = col_character(),
Kaiju_Viral = col_character(),
Kingdom = col_character(),
type = col_character(),
vibrant_quality = col_character(),
method.y = col_character(),
vibrant_prophage = col_character(),
vs2type = col_character(),
max_score_group = col_character()
)
ℹ Use `spec()` for the full column specifications.
colnames(viruses)
[1] "Index" "seqtype" "contig" "checkv_provirus" "checkv_completeness" "checkv_contamination"
[7] "checkv_viral_genes" "checkv_host_genes" "checkv_total_genes" "checkv_length" "checkv_quality" "method.x"
[13] "Classified" "NCBI_taxon" "len" "ID_best" "IDs_all" "Seq"
[19] "Kaiju_Viral" "Kingdom" "score" "pvalue" "bh_pvalue" "type"
[25] "vibrant_quality" "method.y" "vibrant_prophage" "category" "vs2type" "dsDNAphage"
[31] "ssDNA" "NCLDV" "RNA" "lavidaviridae" "max_score" "max_score_group"
[37] "hallmark" "viral" "cellular" "percent_host" "percent_viral" "percent_unknown"
There were 25 warnings (use warnings() to see them)
plot(viruses$RNA, viruses$viral)
There were 50 or more warnings (use warnings() to see the first 50)
plot(viruses$lavidaviridae, viruses$viral)
plot(viruses$NCLDV, viruses$viral)
plot(viruses$ssDNA, viruses$viral)
plot(viruses$checkv_completeness, viruses$hallmark)
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(viruses, aes(x=checkv_host_genes, y=checkv_viral_genes)) +
There were 34 warnings (use warnings() to see them)
geom_hex(bins = 30) +
scale_fill_continuous(type = "viridis", trans="log10") +
theme_bw() +
facet_wrap(~seqtype, scales = "free") +
xlab("Number of Host Genes") +
ylab("Number of Viral Genes")
ggplot(viruses, aes(x=hallmark, y=checkv_length)) +
There were 32 warnings (use warnings() to see them)
geom_hex(bins = 30) +
scale_fill_continuous(type = "viridis", trans="log10") +
theme_bw() +
facet_wrap(~seqtype, scales = "free") +
xlab("Number of Hallmark Genes") +
ylab("Length of Sequence")
ggplot(viruses, aes(x=checkv_length, y=checkv_completeness)) +
geom_hex(bins = 30) +
scale_fill_continuous(type = "viridis", trans="log10") +
theme_bw() +
facet_wrap(~seqtype, scales = "free") +
xlab("Length") +
ylab("Completeness")
ggplot(viruses, aes(x=hallmark, y=checkv_completeness)) +
There were 30 warnings (use warnings() to see them)
geom_hex(bins = 30) +
scale_fill_continuous(type = "viridis", trans="log10") +
theme_bw() +
facet_wrap(~seqtype, scales = "free") +
xlab("Hallmark Genes") +
ylab("Completeness")
table(viruses$seqtype[viruses$checkv_length>50000 & viruses$hallmark==0])/table(viruses$seqtype)
archaea bacteria fungi plasmid protist virus
0.3314556 0.2962900 0.5605600 0.4841683 0.0168000 0.0184000
table(viruses$seqtype[((viruses$checkv_viral_genes*3) <= viruses$checkv_host_genes) & viruses$checkv_provirus=="No"])/table(viruses$seqtype)
archaea bacteria fungi plasmid protist virus
0.9842068 0.8948808 0.9263542 0.8298597 0.7198000 0.0210000
table(viruses$seqtype[viruses$checkv_viral_genes==0 & viruses$checkv_host_genes>=1])/table(viruses$seqtype)
archaea bacteria fungi plasmid protist virus
0.8565537 0.6711065 0.2422398 0.4166333 0.0306000 0.0057000
table(viruses$seqtype[viruses$percent_viral>=50])/table(viruses$seqtype)
archaea bacteria fungi plasmid protist virus
0.000201187 0.027797951 0.006695070 0.004208417 0.053200000 0.306600000
table(viruses$seqtype[viruses$percent_unknown>=75])/table(viruses$seqtype)
archaea bacteria fungi plasmid protist virus
0.11628609 0.08322388 0.82349361 0.45511022 0.85360000 0.39280000
table(viruses$seqtype[viruses$percent_unknown>=75 & viruses$checkv_length<50000])/table(viruses$seqtype)
archaea bacteria fungi plasmid protist virus
0.10552258 0.08021076 0.34814364 0.25430862 0.83540000 0.25300000
table(viruses$seqtype[viruses$hallmark>2])/table(viruses$seqtype[viruses$seqtype %in% unique(viruses$seqtype[viruses$hallmark>2])])
archaea bacteria plasmid virus
0.008952822 0.045428558 0.055711423 0.576000000
table(viruses$seqtype, viruses$Kaiju_Viral)
seqdata <- data.frame(seqtype=viruses$seqtype[!duplicated(viruses$contig)])
rownames(seqdata) <- viruses$contig[!duplicated(viruses$contig)]
library(phyloseq)
features_table <- viruses[viruses$Index==1,]
features_table <- features_table[,colnames(features_table) %in% c(
"checkv_viral_genes",
"checkv_host_genes",
"checkv_unknown_genes",
"checkv_length",
"checkv_completeness",
"checkv_total_genes",
"percent_host",
"percent_viral",
"hallmark",
"percent_unknown"
)]
features_table[is.na(features_table)] <- 0
ft_colnames <- colnames(features_table)
features_table <- t(features_table)
rownames(features_table) <- ft_colnames
colnames(features_table) <- rownames(seqdata)
physeq_pooled <- phyloseq(otu_table(features_table, taxa_are_rows = T))
ordination <- phyloseq::ordinate(physeq =physeq_pooled, method = "PCoA", distance = "bray")
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
shape="numtools", color="num_viruses") +
geom_point(size = 3) +
theme_bw() +
geom_label(label=seqdata$toolcombo)
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
shape="numtools", color="num_viruses") +
geom_point(size = 3) +
theme_bw()
use of each tuning rule
keep_score <- rep(0, nrow(viruses))
There were 44 warnings (use warnings() to see them)
keep_score[viruses$hallmark>2] <- keep_score[viruses$hallmark>2] + 1
table(keep_score, viruses$seqtype)
keep_score archaea bacteria fungi plasmid protist virus
0 9852 61777 1643 4712 5000 4240
1 89 2940 0 278 0 5760
keep_score[viruses$checkv_host_genes>50 & viruses$checkv_provirus=="No"] <- keep_score[viruses$checkv_host_genes>50 & viruses$checkv_provirus=="No"] - 1
table(keep_score, viruses$seqtype)
keep_score archaea bacteria fungi plasmid protist virus
-1 2290 14794 492 949 7 0
0 7580 47164 1151 3776 4993 4240
1 71 2759 0 265 0 5760
keep_score[viruses$percent_unknown>=75 & viruses$checkv_length<50000] <- keep_score[viruses$percent_unknown>=75 & viruses$checkv_length<50000] + 0.5
table(keep_score, viruses$seqtype)
keep_score archaea bacteria fungi plasmid protist virus
-1 2290 14794 492 949 7 0
0 6535 41993 579 2526 816 2868
0.5 1045 5171 572 1250 4177 1372
1 67 2739 0 246 0 4602
1.5 4 20 0 19 0 1158
keep_score[viruses$percent_viral>=50] <- keep_score[viruses$percent_viral>=50] + 0.5
table(keep_score, viruses$seqtype)
keep_score archaea bacteria fungi plasmid protist virus
-1 2290 14794 492 949 7 0
0 6533 41078 568 2525 550 1220
0.5 1047 6086 583 1251 4443 3020
1 67 1855 0 226 0 3184
1.5 4 904 0 39 0 2576
#keep_score[viruses$hallmark>=(viruses$checkv_viral_genes/5)] <- keep_score[viruses$hallmark>=(viruses$checkv_viral_genes/5)] + 1 #add some ratio
keep_score[viruses$checkv_viral_genes==0 & viruses$checkv_host_genes>=1] <- keep_score[viruses$checkv_viral_genes==0 & viruses$checkv_host_genes>=1] - 1
table(keep_score, viruses$seqtype)
keep_score archaea bacteria fungi plasmid protist virus
-2 1654 6446 14 357 0 0
-1 6922 43146 724 1609 121 15
-0.5 574 2186 138 705 39 41
0 248 6281 322 1508 436 1205
0.5 473 3901 445 546 4404 2980
1 66 1854 0 226 0 3184
1.5 4 903 0 39 0 2575
keep_score[((viruses$checkv_viral_genes*3) <= viruses$checkv_host_genes) & viruses$checkv_provirus=="No"] <- keep_score[((viruses$checkv_viral_genes*3) <= viruses$checkv_host_genes) & viruses$checkv_provirus=="No"] - 1 # consider accounting for provirus designation
table(keep_score, viruses$seqtype)
keep_score archaea bacteria fungi plasmid protist virus
-3 1654 6446 14 357 0 0
-2 6922 43146 723 1603 121 15
-1.5 574 2186 138 705 39 41
-1 192 3786 256 1116 54 23
-0.5 442 2341 392 360 3385 131
0 56 2504 67 404 382 1182
0.5 31 1560 53 186 1019 2849
1 66 1845 0 220 0 3184
1.5 4 903 0 39 0 2575
# keep_score[(viruses$checkv_viral_genes*3) <= viruses$checkv_host_genes] <- 0 # consider accounting for provirus designation
keep_score[viruses$checkv_length>500000 & viruses$hallmark==0] <- keep_score[viruses$checkv_length>500000 & viruses$hallmark==0] - 1
table(keep_score, viruses$seqtype)
keep_score archaea bacteria fungi plasmid protist virus
-4 146 129 4 63 0 0
-3 1841 7482 448 411 5 0
-2 6589 41981 305 1486 135 15
-1.5 574 2186 138 705 39 41
-1 192 3839 247 1120 35 24
-0.5 442 2341 392 360 3385 131
0 56 2451 56 400 382 1181
0.5 31 1560 53 186 1019 2849
1 66 1845 0 220 0 3184
1.5 4 903 0 39 0 2575